#!/usr/bin/python # -*- coding: UTF-8 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.0c - 2-11-2015@23:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python: Main File ########################### """ Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites Aufbau einer Datenbank mit einfacher deutscher Sprache Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen Tests am 29.10.2015: https://github.com/rsennrich/clevertagger """ #https://docs.python.org/2/library/configparser.html import os import sys reload(sys) sys.setdefaultencoding("utf-8") sys.path.append('/home/onetipp/python/modules') os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' import random import codecs import re import mod import stopwords import pprint import pattern.de from pattern.de import conjugate from pattern.de import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE from textblob_de import TextBlobDE as TextBlob from textblob_de import PatternTagger from textblob_de import TextBlobDE import treetaggerwrapper #cursorMysql = mod.mysql.cursor() noDoubleHash = set() re_match = r"(\?|\.|\!)" # Match: ". WORT" # # sent_tokenize_list = sent_tokenize(text) # # Summarize the text first and then work on it # tSumy = mod.summarizeText(text) # #tokens = mod.nltk.word_tokenize(tSumy) # tokens = mod.nltk.sent_tokenize(tSumy, language='german') # tokensRaw = mod.nltk.word_tokenize(text) #cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;") #cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word)) import re # https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper # https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py # http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process # result = cursorMysql.fetchall() # lies die Ein und Ausgabedateien inputfile = sys.argv[1] # read file into string # text = codecs.open(inputfile, "r", encoding='utf-8').read() text = codecs.open(inputfile, "r").read() tagger = treetaggerwrapper.TreeTagger(TAGLANG='de', TAGDIR='/home/onetipp/software/treetagger/') GermanStopwords = stopwords.getGermanStopwords() GermanSTTLIgnoreTags = stopwords.getSttsIgnoreTags() tokens = mod.nltk.sent_tokenize(text, language='german') #http://www.clips.ua.ac.be/pages/pattern-de list_conjugate = [ "VAFIN", "VVFIN", ] ListFinal = [] for s in tokens: if s is not None: #print("Satz: ", s) unicode_text = mod.safe_unicode(s) #tSumy = mod.summarizeText(r) tags = tagger.tag_text(unicode_text) tags2 = treetaggerwrapper.make_tags(tags) #pprint.pprint(tags2) for ele in tags2: if ele: word_tmp = ele[0] unicode_text = mod.safe_unicode(ele[0]) word = unicode_text.encode('utf-8') # Sie zeigt auf der Karte wo die Stadt Moskau ist. # Neu: Sie zeigt auf der Karte wo die Stadt Moskau wäre. #todo: if POS-TAG==NE and NEXT-POS-TAG == VV VFIN etc then VV or VVFIN bleibt so wie es ist. #pos_tag = ele[1].encode("ascii") pos_tag_tmp = ele[1] unicode_text = mod.safe_unicode(ele[1]) pos_tag = unicode_text.encode('utf-8') # print("
) Wort:", word, " > Pos:", pos_tag, "
") if pos_tag not in GermanStopwords and pos_tag not in GermanStopwords: # print("Pos tag to possible Change:",pos_tag) # print("Word:", word) if pos_tag in list_conjugate: conj_tmp = conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE) unicode_text = mod.safe_unicode(conj_tmp) conj = unicode_text.encode('utf-8') # print("Word Past: ", conj, " - Lenght: " ,len(ListFinal) ,"
") ListFinal.append("") ListFinal.append(conj) ListFinal.append("") continue else: # stopwordlist 1 # udata=word.decode("utf-8") # asciidata=udata.encode("ascii","ignore") ListFinal.append(word) #https://pypi.python.org/pypi/languagedet # file schreiben #readabilityVar = str(mod.textstat.flesch_reading_ease(text)) writeThis = " ".join(ListFinal) writeThis.encode('utf-8') with codecs.open("/tmp/onetipp_tmp.txt", 'wb+', encoding='utf-8') as f: f.write(writeThis) f.close() # mod.mysql.commit() # mod.mysql.close() # # mod.sphinx.commit() # mod.sphinx.close() exit(0) """ The Flesch Reading Ease formula function name - flesch_reading_ease(text) returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document. 90-100 : Very Easy 80-89 : Easy 70-79 : Fairly Easy 60-69 : Standard 50-59 : Fairly Difficult 30-49 : Difficult 0-29 : Very Confusing """